Objetivo:
Avaliar a classificação de séries temporais usando 3 diferentes abordagens, incluindo nossa hipótese, de codificar séries temporais através do Gráfico de Recorrência;
Cenário Comparativo:
Dados: Considerando os dados estabelecidos no Benchmarking 1;
Atributos *(Feature Space)*: representação vetorial das amostras;
Método de Classificação: Rede Neural Multi-camadas (MLP, sem hiperparametrização - pacote Scikit-learn);
Métrica(s): uma vez que o problema será tratado como classificação multi-rótulo, serão adotadas as seguintes métricas (via pacote Scikit-learn.metrics)
In [8]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
plt.rc('text', usetex=False)
from matplotlib.image import imsave
import pandas as pd
import pickle as cPickle
import os, sys, cv2
from math import *
from pprint import pprint
from tqdm import tqdm_notebook
from mpl_toolkits.axes_grid1 import make_axes_locatable
from PIL import Image
from glob import glob
from IPython.display import display
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, hamming_loss
from pyts.image import RecurrencePlot
# Resource locations: the raw REDD dataset, the two comparison benchmarks,
# and the artefacts of our hypothesis (Recurrence Plot + VGG embedding).
REDD_RESOURCES_PATH = 'datasets/REDD'
BENCHMARKING1_RESOURCES_PATH = 'benchmarkings/cs446 project-electric-load-identification-using-machine-learning/'
BENCHMARKING2_RESOURCES_PATH = 'benchmarkings/Imaging-NILM-time-series/'
HYPOTHESIS_RESOURCES_PATH = 'datasets/hipotese1-recurrenceplot-vggembedding/'
# Make the benchmark/hypothesis helper modules importable (e.g. serie2QMlib below).
sys.path.append(os.path.join(BENCHMARKING1_RESOURCES_PATH, ''))
sys.path.append(os.path.join(BENCHMARKING2_RESOURCES_PATH, ''))
sys.path.append(os.path.join(HYPOTHESIS_RESOURCES_PATH, ''))
from serie2QMlib import *
import warnings
# Suppress library warning noise (sklearn convergence warnings, etc.) in the notebook output.
warnings.filterwarnings(action='ignore')
In [9]:
# Channel indices of the sub-metered devices (REDD) used as the
# multi-label classification targets for training and testing.
use_idx = np.array([3,4,6,7,10,11,13,17,19])
# Fallback column names for reports when no human-readable labels are
# supplied. BUG FIX: corrected typo "APLIANCE" -> "APPLIANCE".
label_columns_idx = ["APPLIANCE_{}".format(i) for i in use_idx]
# Human-readable appliance names, aligned index-by-index with `use_idx`.
appliance_labels = [
"Electronics",
"Refrigerator",
"Dishwasher",
"Furnace",
"Washer Dryer 1",
"Washer Dryer 2",
"Microwave",
"Bathroom GFI",
"Kitchen Outlets"
]
In [10]:
# Benchmark 1 feature space: vectorised instances with binary (on/off)
# multi-label targets, as produced by the Benchmarking 1 pipeline.
_b1_datasets_dir = os.path.join(BENCHMARKING1_RESOURCES_PATH, 'datasets')
Xb1_train = np.load(os.path.join(_b1_datasets_dir, 'train_instances.npy'))
yb1_train = np.load(os.path.join(_b1_datasets_dir, 'train_labels_binary.npy'))
Xb1_test = np.load(os.path.join(_b1_datasets_dir, 'test_instances.npy'))
yb1_test = np.load(os.path.join(_b1_datasets_dir, 'test_labels_binary.npy'))
In [11]:
# Benchmark 2 feature space (Imaging-NILM-time-series) train/test splits.
_b2_datasets_dir = os.path.join(BENCHMARKING2_RESOURCES_PATH, 'datasets')
Xb2_train = np.load(os.path.join(_b2_datasets_dir, 'X_train.npy'))
yb2_train = np.load(os.path.join(_b2_datasets_dir, 'y_train.npy'))
Xb2_test = np.load(os.path.join(_b2_datasets_dir, 'X_test.npy'))
yb2_test = np.load(os.path.join(_b2_datasets_dir, 'y_test.npy'))
In [12]:
# Hypothesis 1 feature space (Recurrence Plot + VGG embedding) train/test splits.
Xh1_train, yh1_train, Xh1_test, yh1_test = (
    np.load(os.path.join(HYPOTHESIS_RESOURCES_PATH, fname))
    for fname in ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy')
)
In [13]:
"""
References:
- https://contentlab.io/using-rqa-and-neural-networks-to-analyze-complex-time-series/
- https://github.com/JuliaDynamics/RecurrenceAnalysis.jl/wiki/Comparison-of-software-packages-for-RQA
- PyRQA
- https://stackoverflow.com/questions/43077427/python-pyopencl-import-error
"""
# # Erro:
# RuntimeError: clGetDeviceInfo failed: OUT_OF_RESOURCES
# # Solução:
# 1. Adicionar o comando 'if self.verbose:' em ~\anaconda3\envs\doutorado\lib\site-packages\pyrqa\opencl.py na linha 262
# 2. Implementar try_or function em ~\anaconda3\envs\doutorado\lib\site-packages\pyrqa\opencl.py e alterar linha 510-511 (função get_device_info)
# def try_or(fn, default):
# try:
# return fn()
# except:
# return default
#
# import pyopencl as cl # Import the OpenCL GPU computing API
# print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices')
# for platform in cl.get_platforms(): # Print each platform on this computer
# print('=' * 60)
# print('Platform - Name: ' + platform.name)
# print('Platform - Vendor: ' + platform.vendor)
# print('Platform - Version: ' + platform.version)
# print('Platform - Profile: ' + platform.profile)
# for device in platform.get_devices(): # Print each device per-platform
# print(' ' + '-' * 56)
# print(' Device - Name: ' + device.name)
# print(' Device - Type: ' + cl.device_type.to_string(device.type))
# #print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency))
# print(' Device - Max Clock Speed: {0} Mhz'.format( try_or(lambda: device.max_clock_frequency, '<n/a>') ) )
# print(' Device - Compute Units: {0}'.format(device.max_compute_units))
# print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size/1024))
# print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size/1024))
# print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size/1073741824.0))
# print('\n')
Out[13]:
In [18]:
from pyrqa.time_series import TimeSeries
from pyrqa.settings import Settings
from pyrqa.computing_type import ComputingType
from pyrqa.neighbourhood import FixedRadius
from pyrqa.metric import EuclideanMetric
from pyrqa.computation import RQAComputation
def calculate_rqa(series, result_format = 'pandas'):
    """Compute Recurrence Quantification Analysis (RQA) features for each series.

    Parameters
    ----------
    series : iterable of 1-D sequences
        Time series to quantify (e.g. power chunks).
    result_format : str, default 'pandas'
        'pandas' returns a DataFrame with one named column per RQA measure;
        any other value returns the raw list of per-series feature arrays.
    """
    def _rqa_features(ts):
        # embedding_dimension=1 / time_delay=1 mirrors the PyTS RecurrencePlot
        # configuration used elsewhere in our hypothesis, so both pipelines
        # quantify the same recurrence structure.
        cfg = Settings(
            TimeSeries(ts, embedding_dimension=1, time_delay=1),
            computing_type=ComputingType.Classic,
            neighbourhood=FixedRadius(0.65),
            similarity_measure=EuclideanMetric,
            theiler_corrector=1
        )
        # The first three entries of to_array() are dropped, keeping the 16
        # scalar RQA measures listed below.
        return RQAComputation.create(cfg, verbose=False).run().to_array()[3:]

    rows = [_rqa_features(ts) for ts in tqdm_notebook(series)]
    if result_format != 'pandas':
        return rows
    return pd.DataFrame(
        data = rows,
        columns = [
            "Recurrence rate (RR)",
            "Determinism (DET)",
            "Average diagonal line length (L)",
            "Longest diagonal line length (L_max)",
            "Divergence (DIV)",
            "Entropy diagonal lines (L_entr)",
            "Laminarity (LAM)",
            "Trapping time (TT)",
            "Longest vertical line length (V_max)",
            "Entropy vertical lines (V_entr)",
            "Average white vertical line length (W)",
            "Longest white vertical line length (W_max)",
            "Longest white vertical line length inverse (W_div)",
            "Entropy white vertical lines (W_entr)",
            "Ratio determinism / recurrence rate (DET/RR)",
            "Ratio laminarity / determinism (LAM/DET)"
        ]
    )
# # Testing RQA calc...
# data_points = [
# [0.1, 0.5, 1.3, 0.7, 0.8, 1.4, 1.6, 1.2, 0.4, 1.1, 0.8, 0.2, 1.3],
# [1.6, 1.2, 0.4, 1.1, 0.8, 0.2, 1.3, 0.1, 0.5, 1.3, 0.7, 0.8, 1.4]
# ]
# rqa_series = calculate_rqa(data_points)
# rqa_series
In [19]:
print("Calculating RQA from chunked series dataset (train / test)...")

def _rqa_frame_from_chunks(chunks_file, output_csv):
    # Compute RQA features for one chunked-series file, replace non-finite
    # values (inf / NaN produced by degenerate series) with zero, persist
    # the result as CSV, and return the DataFrame.
    chunks = np.load(os.path.join(BENCHMARKING1_RESOURCES_PATH, chunks_file))
    frame = calculate_rqa(chunks).replace([np.inf, -np.inf], np.nan).fillna(0)
    frame.to_csv(os.path.join(HYPOTHESIS_RESOURCES_PATH, output_csv))
    return frame

# Train...
Xh2_train = _rqa_frame_from_chunks('datasets/train_power_chunks.npy', 'df_rqa_train.csv')
# Test...
Xh2_test = _rqa_frame_from_chunks('datasets/test_power_chunks.npy', 'df_rqa_test.csv')
In [62]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import multilabel_confusion_matrix
# def metrics(test, predicted):
# return acc, prec, rec, f1, f1m, hl
def plot_predicted_and_ground_truth(test, predicted):
    """Overlay flattened predicted vs. ground-truth label vectors in one plot.

    Both arrays are flattened so multi-label matrices render as one long trace.
    """
    for values, tag in ((predicted, 'pred'), (test, 'Y')):
        plt.plot(values.flatten(), label=tag)
    plt.show()
def classification_report(y_test, y_pred, labels = None):
    """Print per-label and aggregate multi-label classification metrics.

    Parameters
    ----------
    y_test : ndarray of shape (n_samples, n_labels)
        Binary ground-truth label matrix.
    y_pred : ndarray of shape (n_samples, n_labels)
        Binary predicted label matrix.
    labels : list of str, optional
        Human-readable label names; falls back to the module-level
        `label_columns_idx` when omitted.

    Displays a per-appliance metrics table, overall mean/max/min summary,
    and one 2x2 (off/on) confusion matrix per label.
    """
    # Resolve the label names once so the metrics table and the confusion
    # matrices are guaranteed to use the same naming.
    effective_labels = labels if labels is not None else label_columns_idx
    final_performance = []
    for i in range(y_test.shape[1]):
        test = y_test[:, i]
        predicted = y_pred[:, i]
        acc = accuracy_score(test, predicted)
        prec = precision_score(test, predicted)
        rec = recall_score(test, predicted)
        f1 = f1_score(test, predicted)
        f1m = f1_score(test, predicted, average='macro')
        # Per single binary column, hamming loss equals 1 - accuracy; kept for
        # continuity with the multi-label metric set.
        hl = hamming_loss(test, predicted)
        supp = y_test.shape[0]  # number of test samples (same for every label)
        final_performance.append([
            effective_labels[i],
            round(acc*100, 2),
            round(prec*100, 2),
            round(rec*100, 2),
            round(f1*100, 2),
            round(f1m*100, 2),
            round(hl, 2),
            supp
        ])
    print("CLASSIFIER PERFORMANCE BY APPLIANCE (LABEL):")
    df_metrics = pd.DataFrame(
        data = final_performance,
        columns = ["Appliance", "Accuracy", "Precision", "Recall", "F1-score", "F1-macro", "Hamming Loss", "Support"]
    )
    display(df_metrics)
    print("")
    print("OVERALL AVERAGE PERFORMANCE:")
    display(df_metrics.describe().round(2).loc[['mean','max','min']])
    print("")
    print("CONFUSION MATRIX (OFF/ON), BY APPLIANCE:")
    cms = multilabel_confusion_matrix(y_test, y_pred)
    # BUG FIX: iterate the labels actually used in the report instead of the
    # global `appliance_labels`, which could disagree with the `labels`
    # argument (wrong/ mismatched names next to each confusion matrix).
    for i, a in enumerate(effective_labels):
        print("")
        print(" - {}:".format(a))
        print(cms[i])
# Default model classifiers
# NOTE(review): `mlp_classifier` is a single shared Pipeline instance; the
# evaluation cells below all alias and re-fit this same object. sklearn
# re-fits from scratch, so results are correct as long as each cell runs
# fit -> predict -> report sequentially, but only the last fit survives —
# the four "models" cannot be used side by side afterwards.
mlp_classifier = Pipeline([
# ('scaler', StandardScaler()),
('clf', MLPClassifier( random_state = 33))
])
# SVM pipeline, defined but not used in this chunk (planned next step).
svm_classifier = Pipeline([
# ('scaler', StandardScaler()),
('clf', SVC())
])
In [63]:
# Benchmark 1: fit the default MLP on the Benchmark-1 vectorised features.
# NOTE(review): `model_b1` aliases the shared `mlp_classifier` Pipeline,
# which later cells re-fit — predictions must be taken here, as done below.
model_b1 = mlp_classifier#RandomForestClassifier(n_estimators=10)#DecisionTreeClassifier(max_depth=15)
model_b1.fit(Xb1_train, yb1_train)
y_test = np.array(yb1_test)
y_pred = np.array(model_b1.predict(Xb1_test))
classification_report(y_test, y_pred, labels = appliance_labels)
In [64]:
# Benchmark 2: fit the same shared MLP pipeline on the Benchmark-2 features.
model_b2 = mlp_classifier#RandomForestClassifier(n_estimators=10)#DecisionTreeClassifier(max_depth=15)
model_b2.fit(Xb2_train, yb2_train)
y_test = np.array(yb2_test)
# Predictions are cast to int — presumably yb2 labels are stored as floats
# so the binary metrics need integer labels; TODO confirm against the
# Benchmark-2 dataset files.
y_pred = np.array(model_b2.predict(Xb2_test).astype(int))
classification_report(y_test, y_pred, labels = appliance_labels)
In [65]:
# Hypothesis 1: fit the shared MLP pipeline on the RP + VGG-embedding features.
model_h1 = mlp_classifier#RandomForestClassifier(n_estimators=10)#DecisionTreeClassifier(max_depth=15)
model_h1.fit(Xh1_train, yh1_train)
y_test = np.array(yh1_test)
y_pred = np.array(model_h1.predict(Xh1_test))
classification_report(y_test, y_pred, labels = appliance_labels)
In [66]:
# Hypothesis 2: fit the shared MLP pipeline on the RQA feature space.
# NOTE(review): the labels reused here are `yh1_*` — this looks intentional
# (the RQA features Xh2_* were derived from the same Benchmark-1 power chunks,
# so they share the same label alignment), but verify the row ordering of
# Xh2_train/Xh2_test matches yh1_train/yh1_test.
model_h2 = mlp_classifier#RandomForestClassifier(n_estimators=10)#DecisionTreeClassifier(max_depth=15)
model_h2.fit(Xh2_train, yh1_train)
y_test = np.array(yh1_test)
y_pred = np.array(model_h2.predict(Xh2_test))
classification_report(y_test, y_pred, labels = appliance_labels)
...(Próximo passo: avaliar classificador SVM com suporte à classificação multi-rótulo)...
A utilização de RPs para a classificação multirótulo de cargas, no contexto descrito, demonstra os melhores resultados para as métricas Acurácia, Precisão, F1-score ponderado (macro) e Hamming Loss.
In [ ]: